# Python libraries
# Classic,data manipulation and linear algebra
import pandas as pd
import numpy as np
# Plots
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.tools as tls
import plotly.figure_factory as ff
# Read the insurance CSV (path is relative to the working directory) and
# preview its first five rows.
path = 'dataset/insurance.csv'
data = pd.read_csv(filepath_or_buffer=path)
data.head(n=5)
| | age | sex | bmi | children | smoker | region | charges |
|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
# Last five rows, transposed so that each record is shown as a column.
data.tail(n=5).transpose()
| | 1333 | 1334 | 1335 | 1336 | 1337 |
|---|---|---|---|---|---|
| age | 50 | 18 | 18 | 21 | 61 |
| sex | male | female | female | female | female |
| bmi | 30.97 | 31.92 | 36.85 | 25.8 | 29.07 |
| children | 3 | 0 | 0 | 0 | 0 |
| smoker | no | no | no | no | yes |
| region | northwest | northeast | southeast | southwest | northwest |
| charges | 10600.5483 | 2205.9808 | 1629.8335 | 2007.945 | 29141.3603 |
# Remove the `region` column from `data` in place.
data.drop(columns='region', inplace=True)
# Number of missing values in each remaining column.
data.isna().sum()
age 0 sex 0 bmi 0 children 0 smoker 0 charges 0 dtype: int64
# Dimensions of `data` as a (rows, columns) tuple.
data.shape
(1338, 6)
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()
| | age | bmi | children | charges |
|---|---|---|---|---|
| count | 1338.000000 | 1338.000000 | 1338.000000 | 1338.000000 |
| mean | 39.207025 | 30.663397 | 1.094918 | 13270.422265 |
| std | 14.049960 | 6.098187 | 1.205493 | 12110.011237 |
| min | 18.000000 | 15.960000 | 0.000000 | 1121.873900 |
| 25% | 27.000000 | 26.296250 | 0.000000 | 4740.287150 |
| 50% | 39.000000 | 30.400000 | 1.000000 | 9382.033000 |
| 75% | 51.000000 | 34.693750 | 2.000000 | 16639.912515 |
| max | 64.000000 | 53.130000 | 5.000000 | 63770.428010 |
import plotly.express as px

# Draw one interactive histogram per column of `data`.
# The loop body must be indented; without indentation the cell fails with an
# IndentationError before any figure is produced.
for i in data.columns:
    fig = px.histogram(data, x=i)
    fig.update_layout(bargap=0.2, width=500, height=300)
    fig.show()

# Column dtypes and non-null counts.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1338 entries, 0 to 1337 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 1338 non-null int64 1 sex 1338 non-null object 2 bmi 1338 non-null float64 3 children 1338 non-null int64 4 smoker 1338 non-null object 5 charges 1338 non-null float64 dtypes: float64(2), int64(2), object(2) memory usage: 62.8+ KB
# Distinct labels present in the `sex` column.
data["sex"].unique()
array(['female', 'male'], dtype=object)
# Distinct labels present in the `smoker` column.
data["smoker"].unique()
array(['yes', 'no'], dtype=object)
# Save the frame as it stands at this point: `sex` and `smoker` are still text
# labels, and the row index is written as the first (unnamed) CSV column.
data.to_csv("preprocessed.csv")

# Integer codes for the two binary categorical columns.
encoding = {"sex": {"male": 1, "female": 0}, "smoker": {"yes": 1, "no": 0}}

# Encode column by column with Series.map rather than DataFrame.replace.
# Turning strings into ints via replace() relies on pandas silently downcasting
# the resulting object column, which is deprecated since pandas 2.2.
# map() yields NaN for any label missing from the mapping, so the explicit
# int64 cast raises instead of letting an unencoded value slip through.
df = data.assign(
    **{
        column: data[column].map(codes).astype("int64")
        for column, codes in encoding.items()
    }
)
df.head()
| | age | sex | bmi | children | smoker | charges |
|---|---|---|---|---|---|---|
| 0 | 19 | 0 | 27.900 | 0 | 1 | 16884.92400 |
| 1 | 18 | 1 | 33.770 | 1 | 0 | 1725.55230 |
| 2 | 28 | 1 | 33.000 | 3 | 0 | 4449.46200 |
| 3 | 33 | 1 | 22.705 | 0 | 0 | 21984.47061 |
| 4 | 32 | 1 | 28.880 | 0 | 0 | 3866.85520 |
import plotly.express as px

# Pairwise scatter plots of every column of the encoded frame, with points
# coloured by `charges`.
fig = px.scatter_matrix(df, color="charges", dimensions=df.columns)

# Figure-level settings collected in one place and applied together.
layout_options = {
    "title": 'Scatterplot Matrix',
    "dragmode": 'select',
    "hovermode": 'closest',
    "width": 800,
    "height": 800,
}
fig.update_layout(**layout_options)
fig.show()

# Confirm the dtypes of the encoded frame.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1338 entries, 0 to 1337 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 1338 non-null int64 1 sex 1338 non-null int64 2 bmi 1338 non-null float64 3 children 1338 non-null int64 4 smoker 1338 non-null int64 5 charges 1338 non-null float64 dtypes: float64(2), int64(4) memory usage: 62.8 KB
# Correlation matrix of all columns of the encoded frame.
corrmat = df.corr()

# Heatmap of the correlation matrix; axis labels are the column names.
heatmap = go.Heatmap(
    z=corrmat.values,
    x=list(corrmat.columns),
    y=list(corrmat.index),
    colorscale='Viridis',
)
fig = go.Figure(data=heatmap)
fig.update_layout(title='Correlation', width=500, height=500)
fig.show()

# Correlation of each column with `charges`, ascending.
# Reuses the matrix computed above instead of calling df.corr() a second time.
corrmat['charges'].sort_values()
sex 0.057292 children 0.067998 bmi 0.198341 age 0.299008 smoker 0.787251 charges 1.000000 Name: charges, dtype: float64
# Number of smokers and non-smokers, with separate bars per sex.
sns.catplot(
    data=data,
    x="smoker",
    hue="sex",
    kind="count",
    palette="magma",
)
<seaborn.axisgrid.FacetGrid at 0x19115795be0>
# Violin plot of charges per sex, with separate violins for smokers and non-smokers.
sns.catplot(
    data=data,
    x="sex",
    y="charges",
    hue="smoker",
    kind="violin",
    palette="magma",
)
<seaborn.axisgrid.FacetGrid at 0x191119d2c40>
plt.figure(figsize=(12, 5))
plt.title("Box plot for charges of women")
# Rows with sex == 0, which is the code assigned to "female" in `encoding`.
sns.boxplot(
    data=df.loc[df["sex"] == 0],
    x="charges",
    y="smoker",
    orient="h",
    palette="rainbow",
)
<AxesSubplot:title={'center':'Box plot for charges of women'}, xlabel='charges', ylabel='smoker'>
plt.figure(figsize=(12, 5))
plt.title("Box plot for charges of men")
# Rows with sex == 1, which is the code assigned to "male" in `encoding`.
sns.boxplot(
    data=df.loc[df["sex"] == 1],
    x="charges",
    y="smoker",
    orient="h",
    palette="rainbow",
)
<AxesSubplot:title={'center':'Box plot for charges of men'}, xlabel='charges', ylabel='smoker'>
# Distribution of age
sns.displot(data["age"], color = 'g')
<seaborn.axisgrid.FacetGrid at 0x19117dc0070>
# Smoker / non-smoker counts per sex, restricted to rows where age is exactly 18.
sns.catplot(
    data=data.loc[data["age"] == 18],
    x="smoker",
    hue="sex",
    kind="count",
    palette="rainbow",
)
plt.title("The number of smokers and non-smokers (18 years old)")
Text(0.5, 1.0, 'The number of smokers and non-smokers (18 years old)')
plt.figure(figsize=(12, 5))
plt.title("Box plot for charges 18 years old smokers")
# NOTE: the filter selects every 18-year-old, so the plot has one box per
# smoker category, not smokers only as the title suggests.
sns.boxplot(
    data=data.loc[data["age"] == 18],
    x="charges",
    y="smoker",
    orient="h",
    palette="pink",
)
<AxesSubplot:title={'center':'Box plot for charges 18 years old smokers'}, xlabel='charges', ylabel='smoker'>
# Distribution of charges and age for non-smokers (smoker == 0 in the encoded frame).
# KDE joint plot of age vs. charges.
g = sns.jointplot(x="age", y="charges", data = df[(df.smoker == 0)],kind="kde", color="m")
# Overlay the individual observations as white "+" markers.
g.plot_joint(plt.scatter, c="w", s=30, linewidth=1, marker="+")
# Make the first collection on the joint axes fully transparent.
# NOTE(review): which artist is at index 0 depends on the seaborn version - confirm.
g.ax_joint.collections[0].set_alpha(0)
# Axis labels are the literal math-text strings X and Y, not the column names.
g.set_axis_labels("$X$", "$Y$")
<seaborn.axisgrid.JointGrid at 0x19117faddf0>
# Distribution of charges and age for smokers (smoker == 1 in the encoded frame).
# KDE joint plot of age vs. charges.
g = sns.jointplot(x="age", y="charges", data = df[(df.smoker == 1)],kind="kde", color="c")
# Overlay the individual observations as white "+" markers.
g.plot_joint(plt.scatter, c="w", s=30, linewidth=1, marker="+")
# Make the first collection on the joint axes fully transparent.
# NOTE(review): which artist is at index 0 depends on the seaborn version - confirm.
g.ax_joint.collections[0].set_alpha(0)
# Axis labels are the literal math-text strings X and Y, not the column names.
g.set_axis_labels("$X$", "$Y$")
<seaborn.axisgrid.JointGrid at 0x191191392b0>
# Charges against age with one regression fit per smoker category
# (smokers and non-smokers).
sns.lmplot(
    data=data,
    x="age",
    y="charges",
    hue="smoker",
    palette="inferno_r",
    height=7,
)
<seaborn.axisgrid.FacetGrid at 0x19117f5a850>
# Distribution of BMI over the whole dataset.
# `ax` receives whatever sns.displot returns.
ax = sns.displot(data["bmi"], color="m")

# Distribution of charges for patients whose BMI is 30 or more.
sns.displot(data.loc[data["bmi"] >= 30, "charges"], color="m")
<seaborn.axisgrid.FacetGrid at 0x1911933ca00>
# Distribution of charges for patients whose BMI is below 30.
sns.displot(data.loc[data["bmi"] < 30, "charges"], color="b")
<seaborn.axisgrid.FacetGrid at 0x191192dc400>
# Distribution of bmi and charges over the whole dataset.
# KDE joint plot of bmi vs. charges.
g = sns.jointplot(x="bmi", y="charges", data = data,kind="kde", color="r")
# Overlay the individual observations as white "+" markers.
g.plot_joint(plt.scatter, c="w", s=30, linewidth=1, marker="+")
# Make the first collection on the joint axes fully transparent.
# NOTE(review): which artist is at index 0 depends on the seaborn version - confirm.
g.ax_joint.collections[0].set_alpha(0)
# Axis labels are the literal math-text strings X and Y, not the column names.
g.set_axis_labels("$X$", "$Y$")
<seaborn.axisgrid.JointGrid at 0x1911937bd90>
plt.figure(figsize=(10, 6))
# Charges against BMI, coloured by smoker status.
ax = sns.scatterplot(
    data=data,
    x='bmi',
    y='charges',
    hue='smoker',
    palette='magma',
)
ax.set_title('Scatter plot of charges and bmi')

# Same relationship with one regression fit per smoker category.
sns.lmplot(
    data=data,
    x="bmi",
    y="charges",
    hue="smoker",
    palette='magma',
    height=8,
)
<seaborn.axisgrid.FacetGrid at 0x191192d2af0>
# Number of records for each value of `children`.
sns.catplot(
    data=data,
    x="children",
    kind="count",
    palette="ch:.25",
    height=6,
)
<seaborn.axisgrid.FacetGrid at 0x19119779a90>
# Smokers and non-smokers who have at least one child, with separate bars per sex.
sns.catplot(
    data=data.loc[data["children"] > 0],
    x="smoker",
    hue="sex",
    kind="count",
    palette="rainbow",
    height=6,
)
<seaborn.axisgrid.FacetGrid at 0x191193cb490>
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Features are every encoded column except the target; the target is `charges`.
X = df.drop(columns=['charges'])
Y = df['charges']

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=101,
)
print(X_train.shape)
print(X_test.shape)
(1070, 5) (268, 5)
from sklearn.linear_model import LinearRegression

# Fit a plain linear regression on the training split.
lr = LinearRegression()
lr.fit(X_train, Y_train)

# Predictions on both splits.
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# score() on a scikit-learn regressor returns R^2; it is printed under the
# label "Accuracy".
lr_train_acc = lr.score(X_train, Y_train)
lr_acc = lr.score(X_test, Y_test)
print(f"Train Accuracy: {lr_train_acc!s}")
print(f"Test Accuracy: {lr_acc!s}")
Train Accuracy: 0.7461111742852395 Test Accuracy: 0.7618254042736423
from sklearn.ensemble import RandomForestRegressor

# 1000 depth-limited trees, all CPU cores, fixed seed.
forest = RandomForestRegressor(
    n_estimators=1000,
    criterion='squared_error',
    max_depth=5,
    random_state=100,
    n_jobs=-1,
)
forest.fit(X_train, Y_train)

# Predictions on both splits (reused later for the residual plot).
forest_train_pred = forest.predict(X_train)
forest_test_pred = forest.predict(X_test)

# score() on a scikit-learn regressor returns R^2; it is printed under the
# label "Accuracy".
rfc_train_acc = forest.score(X_train, Y_train)
rfc_acc = forest.score(X_test, Y_test)
print(f"Train Accuracy: {rfc_train_acc!s}")
print(f"Test Accuracy: {rfc_acc!s}")
Train Accuracy: 0.8891494245729069 Test Accuracy: 0.8629064897819936
from sklearn.tree import DecisionTreeRegressor

# Single regression tree limited to depth 5 (no random_state is set).
dtr = DecisionTreeRegressor(max_depth=5, criterion='squared_error')
dtr.fit(X_train, Y_train)

# Predictions on both splits.
dtr_train_pred = dtr.predict(X_train)
dtr_test_pred = dtr.predict(X_test)

# score() on a scikit-learn regressor returns R^2; it is printed under the
# label "Accuracy".
dtr_train_acc = dtr.score(X_train, Y_train)
dtr_acc = dtr.score(X_test, Y_test)
print(f"Train Accuracy: {dtr_train_acc!s}")
print(f"Test Accuracy: {dtr_acc!s}")
Train Accuracy: 0.8802943823311056 Test Accuracy: 0.8618463447218458
from sklearn import linear_model

# Lasso (L1-regularised linear regression) with alpha = 0.1.
lassoReg = linear_model.Lasso(alpha=0.1)
lassoReg.fit(X_train, Y_train)

# Predictions on both splits.
lassoReg_train_pred = lassoReg.predict(X_train)
lassoReg_test_pred = lassoReg.predict(X_test)

# score() on a scikit-learn regressor returns R^2; it is printed under the
# label "Accuracy".
lassoReg_train_acc = lassoReg.score(X_train, Y_train)
lassoReg_acc = lassoReg.score(X_test, Y_test)
print(f"Train Accuracy: {lassoReg_train_acc!s}")
print(f"Test Accuracy: {lassoReg_acc!s}")
Train Accuracy: 0.7461111736377029 Test Accuracy: 0.7618255389520554
# Residual plot for the random forest: predicted value on the x-axis and
# (prediction - actual) on the y-axis, for the train and test splits.
plt.figure(figsize=(10, 6))
plt.scatter(
    forest_train_pred,
    forest_train_pred - Y_train,
    c='black', marker='o', s=35, alpha=0.5,
    label='Train data',
)
plt.scatter(
    forest_test_pred,
    forest_test_pred - Y_test,
    c='c', marker='o', s=35, alpha=0.7,
    label='Test data',
)
plt.xlabel('Predicted values')
# The plotted quantity is the residual; the earlier label 'Tailings' was a
# mistranslation.
plt.ylabel('Residuals')
plt.legend(loc='upper left')
# Zero-error reference line, drawn over a fixed x-range of 0 to 60000.
plt.hlines(y=0, xmin=0, xmax=60000, lw=2, color='red')
plt.show()
# One row per model with its test-split score, then shown best-first.
models = pd.DataFrame(
    [
        ('Linear Regression', lr_acc),
        ('Random Forest Regression', rfc_acc),
        ('Decision Tree Regression', dtr_acc),
        ('Lasso Regression', lassoReg_acc),
    ],
    columns=['Model', 'Score'],
)
models.sort_values(by='Score', ascending=False)
| | Model | Score |
|---|---|---|
| 1 | Random Forest Regression | 0.862906 |
| 2 | Decision Tree Regression | 0.861846 |
| 3 | Lasso Regression | 0.761826 |
| 0 | Linear Regression | 0.761825 |
# Write the comparison table to CSV without the row index.
models.to_csv(r'models2.csv', index=False)

# Bar chart of the score per model.
fig = px.bar(models, y='Score', x='Model')
fig.update_layout(height=500, width=500)
fig.show()
# pickling the model
import pickle
import warnings

# NOTE(review): this silences every warning for the rest of the session, not
# only the ones raised in this cell.
warnings.filterwarnings('ignore')

# Context managers guarantee both file handles are closed, even if dumping or
# loading raises; previously the read handle was never closed.
with open("result.pkl", "wb") as pickle_out:
    pickle.dump(forest, pickle_out)

# pickle.load can execute arbitrary code; only load files produced by this
# notebook, never files from an untrusted source.
with open('result.pkl', 'rb') as pickle_in:
    rfr = pickle.load(pickle_in)

# Single example to score. The values follow the training column order, which
# is read back from the fitted model so the input carries the same feature
# names it was trained with rather than being an anonymous nested list.
sample = pd.DataFrame([[28, 1, 33, 3, 0]], columns=rfr.feature_names_in_)
prediction = rfr.predict(sample)
print(prediction)
[6332.10105162]
# Scale the predicted charge by a fixed factor of 75.62.
# NOTE(review): presumably a currency exchange rate - confirm and name it.
result = 75.62 * prediction
# `prediction` holds a single value; take it and show it to two decimals.
res = result[0]
round(res, 2)
478833.48